library(rjags)
## Loading required package: coda
## Linked to JAGS 4.3.0
## Loaded modules: basemod,bugs
#library(rnoaa)
library(daymetr)
#devtools::install_github("EcoForecast/ecoforecastR")
library(ecoforecastR)
Read in Google Flu trends data:
## Pull the Google Flu Trends table (11 header lines skipped) and extract
## the Massachusetts flu index as our response series.
gflu <- read.csv("http://www.google.org/flutrends/about/data/flu/us/data.txt", skip = 11)
time <- as.Date(gflu$Date)
y <- gflu$Massachusetts
## Quick look at the raw series on a log y-axis.
plot(time, y, lwd = 2, type = 'l', log = 'y', ylab = "Flu Index")
Create random walk model in JAGS:
## JAGS random-walk state-space model, fit on the log scale:
##   data model:    y[t] ~ N(x[t], tau_obs)    (observation precision)
##   process model: x[t] ~ N(x[t-1], tau_add)  (process precision)
##   priors: normal initial condition x[1]; gamma priors on both precisions.
## The string is parsed by JAGS, not R, so its contents are left untouched.
RandomWalk = "
model{
#### Data Model
for(t in 1:n){
y[t] ~ dnorm(x[t],tau_obs)
}
#### Process Model
for(t in 2:n){
x[t]~dnorm(x[t-1],tau_add)
}
#### Priors
x[1] ~ dnorm(x_ic,tau_ic)
tau_obs ~ dgamma(a_obs,r_obs)
tau_add ~ dgamma(a_add,r_add)
}
"
Create data list to pass into model:
## Data passed to JAGS. The model is fit on log(y); x_ic/tau_ic give the
## prior mean/precision for the initial state, and a_*/r_* are weak gamma
## hyperparameters for the observation and process precisions.
data <- list(y=log(y),
n=length(y),
x_ic=log(1000),
tau_ic=100,
a_obs=1,
r_obs=1,
a_add=1,
r_add=1)
Set initial conditions using random samples from the actual data:
## Draw starting values for each chain from bootstrap resamples of the data,
## so the chains start from dispersed but plausible precisions.
nchain <- 3
init <- lapply(seq_len(nchain), function(chain) {
  boot <- sample(y, length(y), replace = TRUE)   ## bootstrap resample of y
  list(tau_add = 1 / var(diff(log(boot))),       ## process precision guess
       tau_obs = 5 / var(log(boot)))             ## observation precision guess
})
Send the model, the data, and the initial conditions to JAGS, which will return the JAGS model object:
## Compile the JAGS model object from the model string, the data list, and
## the per-chain initial values. The chain count now reuses `nchain` (defined
## alongside `init` above) instead of a hard-coded 3, so the number of chains
## and the length of `init` cannot drift apart.
j.model <- jags.model(file = textConnection(RandomWalk),
                      data = data,
                      inits = init,
                      n.chains = nchain)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 620
## Unobserved stochastic nodes: 622
## Total graph size: 1249
##
## Initializing model
Take a few samples and assess convergence:
## burn-in
## Short run monitoring only the two precisions, to check mixing cheaply.
jags.out <- coda.samples (model = j.model,
variable.names = c("tau_add","tau_obs"),
n.iter = 1000)
## Trace and density plots for visual convergence assessment.
plot(jags.out)
Take additional samples. This time we include the full vector of X:
## Long run that also monitors the full latent state vector x[1..n].
jags.out <- coda.samples (model = j.model,
variable.names = c("x","tau_add","tau_obs"),
n.iter = 10000)
Calculate confidence intervals, un-log-transform data, and plot:
time.rng = c(1,length(time)) ## adjust to zoom in and out
out <- as.matrix(jags.out)
x.cols <- grep("^x",colnames(out)) ## grab all columns that start with the letter x
## 2.5%/50%/97.5% quantiles of each state, back-transformed with exp().
ci <- apply(exp(out[,x.cols]),2,quantile,c(0.025,0.5,0.975)) ## model was fit on log scale
plot(time,ci[2,],type='n',ylim=range(y,na.rm=TRUE),ylab="Flu Index",log='y',xlim=time[time.rng])
## adjust x-axis label to be monthly if zoomed
if(diff(time.rng) < 100){
axis.Date(1, at=seq(time[time.rng[1]],time[time.rng[2]],by='month'), format = "%Y-%m")
}
## Shaded 95% envelope, then the raw observations on top.
ecoforecastR::ciEnvelope(time,ci[1,],ci[3,],col=ecoforecastR::col.alpha("lightBlue",0.75))
points(time,y,pch="+",cex=0.5)
Here we check histograms of the observation and process uncertainty, converted from precisions back into standard deviations, as well as the correlations between the variables:
## NOTE(review): columns 1-2 are assumed to be tau_add and tau_obs — this
## relies on coda's alphabetical column ordering ("t..." sorts before "x[...]");
## the printed cor() labels below are consistent with that, but verify if the
## monitored variables ever change. sd = 1/sqrt(precision).
hist(1/sqrt(out[,1]),main=colnames(out)[1])
hist(1/sqrt(out[,2]),main=colnames(out)[2])
plot(out[,1],out[,2],pch=".",xlab=colnames(out)[1],ylab=colnames(out)[2])
cor(out[,1:2])
## tau_add tau_obs
## tau_add 1.00000000 0.07905067
## tau_obs 0.07905067 1.00000000
They look normally distributed and uncorrelated. Good.
To look at how observation frequency affects data assimilation, convert 3 out of every 4 observations to NA (i.e. treat the data as approximately monthly) and refit the model.
# Convert 3 out of every 4 observations to NA (approximately monthly data).
# rep(..., length.out=) keeps the mask in sync with y for any series length
# (the original hard-coded 155 repeats = 620 points), and we avoid reusing
# the name `x`, which is also the latent state variable in the JAGS model.
na.mask <- rep(c(NA, NA, NA, 1), length.out = length(y))
y.monthly <- as.vector(y * na.mask)  # plain numeric vector; JAGS needs this
plot(time, y.monthly, ylab = "Flu Index", lwd = 2, log = 'y')
Create new data object, and set initial conditions using random samples from this data:
## Same data list as before, but with the gappy monthly series; JAGS treats
## the NA entries of y as missing and imputes the corresponding states.
data <- list(y=log(y.monthly),
n=length(y.monthly),
x_ic=log(1000),
tau_ic=100,
a_obs=1,
r_obs=1,
a_add=1,
r_add=1)
# Initial conditions from bootstrap resamples of the gappy monthly series.
# y.monthly contains NAs, so var() needs na.rm = TRUE; without it both
# precision guesses come out NA and JAGS falls back to generating its own
# starting values instead of using these.
nchain <- 3
init <- list()
for(i in 1:nchain){
  y.samp <- sample(y.monthly, length(y.monthly), replace = TRUE)  # bootstrap resample
  init[[i]] <- list(tau_add = 1/var(diff(log(y.samp)), na.rm = TRUE),
                    tau_obs = 5/var(log(y.samp), na.rm = TRUE))
}
Send the model, the data, and the initial conditions to JAGS, which will return the JAGS model object:
## Compile the monthly-data model. As above, use `nchain` rather than a
## hard-coded 3 so the chain count always matches the inits list.
j.model.monthly <- jags.model(file = textConnection(RandomWalk),
                              data = data,
                              inits = init,
                              n.chains = nchain)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 155
## Unobserved stochastic nodes: 1087
## Total graph size: 1249
##
## Initializing model
# Take a few samples and assess convergence:
## Short precision-only run for convergence checking, as with the full data.
jags.out.monthly <- coda.samples (model = j.model.monthly,
variable.names = c("tau_add","tau_obs"),
n.iter = 1000)
plot(jags.out.monthly)
Take additional samples. This time we include the full vector of X:
jags.out.monthly <- coda.samples (model = j.model.monthly,
variable.names = c("x","tau_add","tau_obs"),
n.iter = 10000)
Calculate confidence intervals, un-log-transform data, and plot using only the observations that inform the model:
time.rng = c(1,length(time)) ## adjust to zoom in and out
out <- as.matrix(jags.out.monthly)
x.cols <- grep("^x",colnames(out)) ## grab all columns that start with the letter x
## State quantiles back-transformed to the original (index) scale.
ci <- apply(exp(out[,x.cols]),2,quantile,c(0.025,0.5,0.975)) ## model was fit on log scale
plot(time,ci[2,],type='n',ylim=range(y,na.rm=TRUE),ylab="Flu Index",log='y',xlim=time[time.rng])
## adjust x-axis label to be monthly if zoomed
if(diff(time.rng) < 100){
axis.Date(1, at=seq(time[time.rng[1]],time[time.rng[2]],by='month'), format = "%Y-%m")
}
## Envelope plus only the observations that were actually fed to the model.
ecoforecastR::ciEnvelope(time,ci[1,],ci[3,],col=ecoforecastR::col.alpha("lightBlue",0.75))
points(time,y.monthly,pch="+",cex=0.5)
Check for correlation between tau_obs and tau_add, and check histograms:
## As before, columns 1-2 are taken to be tau_add/tau_obs (alphabetical
## coda ordering); sd = 1/sqrt(precision).
hist(1/sqrt(out[,1]),main=colnames(out)[1])
hist(1/sqrt(out[,2]),main=colnames(out)[2])
plot(out[,1],out[,2],pch=".",xlab=colnames(out)[1],ylab=colnames(out)[2])
cor(out[,1:2])
## tau_add tau_obs
## tau_add 1.00000000 -0.04515888
## tau_obs -0.04515888 1.00000000
Points included in model are in blue, points that were converted to NA are in red:
## Replot the monthly-model envelope, coloring each observation by whether
## it informed the fit (blue) or was converted to NA (red).
plot(time,ci[2,],type='n',ylim=range(y,na.rm=TRUE),ylab="Flu Index",log='y',xlim=time[time.rng])
## adjust x-axis label to be monthly if zoomed
if(diff(time.rng) < 100){
axis.Date(1, at=seq(time[time.rng[1]],time[time.rng[2]],by='month'), format = "%Y-%m")
}
ecoforecastR::ciEnvelope(time,ci[1,],ci[3,],col=ecoforecastR::col.alpha("lightBlue",0.75))
## Build a per-point color vector: red where y.monthly is NA, blue otherwise.
points.df <- as.data.frame(y.monthly)
points.df$color <- ifelse(is.na(points.df$y.monthly), "red", "blue")
## Plot ALL observations (y), colored by whether each one was held out.
points(time, y, pch="+", cex=.5, col=points.df$color)
* Compare the CI between the two runs.
The confidence interval when we include additional points is much narrower than when the model was given fewer points. The model with data gaps has evenly-spaced data gaps, but if the gaps were larger we would see the confidence intervals balloon around those gaps. This is because the state-space model takes into account the data points before and after the missing data, so the CI is largest in the middle of the data gap.
# Predicted-vs-observed comparison at the time points that were HELD OUT of
# the monthly fit (positions 1-3 of each group of 4 were NA'd before fitting;
# position 4 was kept, so this mask selects the unobserved times).
# length.out keeps the mask in sync with y; no matrix wrapper is needed for
# elementwise multiplication.
keep3.mat <- rep(c(1, 1, 1, NA), length.out = length(y))
# true values at the held-out times:
y.keep3 <- y * keep3.mat
# posterior median predictions at the held-out times:
pred.keep3 <- ci[2,] * keep3.mat
plot(pred.keep3, y.keep3, ylim = c(0, 15000), xlim = c(0, 15000))
The plot of predicted vs observed values shows that as observed values increase (i.e., toward the most extreme outliers), the values are predicted within a more conservative range, because the predictions are informed by the surrounding data. In the above time-series plot, we see that the red points (NA) happen to fall at some of the peaks of the data. Because they are at the peaks, by definition the surrounding values are lower, so the peaks won’t be accurately predicted.
The accuracy and precision of the state estimates varies by the value of y. At the more extreme highs, the full-data model has narrow confidence intervals (high precision) and high accuracy. In the reduced-data model, the high values do not fall within the wider confidence interval, representing low precision and low accuracy. At all other values, however, both models were very accurate, though the full-data model was more precise.
When we reduce the data volume, the uncertainty estimates derived from tau_obs and tau_add (i.e., the standard deviations 1/sqrt(tau)) change. The observation-error standard deviation increases most dramatically, from a mean close to .12 to a mean close to .28. This makes sense, because observation uncertainty can decrease with more data. The process-error standard deviation, derived from tau_add, does not necessarily shrink with more data, but in our case it decreases from ~.28 to ~.21 when data is reduced.
Return to the original data and instead of removing 3/4 of the data remove the last 40 observations (convert to NA) and refit the model to make a forecast for this period.
## Hold out the final 40 observations (set to NA) so the model must forecast
## them. Index from the end of the series instead of hard-coding 581:620,
## so this works for any series length.
y.fcast <- y
y.fcast[(length(y.fcast) - 39):length(y.fcast)] <- NA
# Create new data object, and set initial conditions using random samples from this data:
## Same structure as before, with the forecast series (last 40 values NA).
data <- list(y=log(y.fcast),
n=length(y.fcast),
x_ic=log(1000),
tau_ic=100,
a_obs=1,
r_obs=1,
a_add=1,
r_add=1)
# Initial conditions from bootstrap resamples of the forecast series.
# y.fcast contains 40 NAs, so var() needs na.rm = TRUE; without it both
# precision guesses are NA and the supplied inits are useless.
nchain <- 3
init <- list()
for(i in 1:nchain){
  y.samp <- sample(y.fcast, length(y.fcast), replace = TRUE)  # bootstrap resample
  init[[i]] <- list(tau_add = 1/var(diff(log(y.samp)), na.rm = TRUE),
                    tau_obs = 5/var(log(y.samp), na.rm = TRUE))
}
# Send the model, the data, and the initial conditions to JAGS, which will
# return the JAGS model object. Use `nchain` rather than a hard-coded 3 so
# the chain count always matches the inits list.
j.model.fcast <- jags.model(file = textConnection(RandomWalk),
                            data = data,
                            inits = init,
                            n.chains = nchain)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 580
## Unobserved stochastic nodes: 662
## Total graph size: 1249
##
## Initializing model
# Take samples and assess convergence:
jags.out.fcast <- coda.samples (model = j.model.fcast,
variable.names = c("tau_add","tau_obs"),
n.iter = 1000)
plot(jags.out.fcast)
Looks good, take more samples:
## NOTE(review): n.iter is 1000 here, unlike the 10000 used for the earlier
## full runs — confirm this shorter run was intentional.
jags.out.fcast <- coda.samples (model = j.model.fcast,
variable.names = c("x", "tau_add","tau_obs"),
n.iter = 1000)
## Zoom the plot window in on the last ~80 points to see the forecast region.
time.rng = c(length(time)-80,length(time)) ## adjust to zoom in and out
out <- as.matrix(jags.out.fcast)
x.cols <- grep("^x",colnames(out)) ## grab all columns that start with the letter x
ci <- apply(exp(out[,x.cols]),2,quantile,c(0.025,0.5,0.975)) ## model was fit on log scale
plot(time,ci[2,],type='n',ylim=range(y,na.rm=TRUE),ylab="Flu Index",log='y',xlim=time[time.rng])
## adjust x-axis label to be monthly if zoomed
if(diff(time.rng) < 100){
axis.Date(1, at=seq(time[time.rng[1]],time[time.rng[2]],by='month'), format = "%Y-%m")
}
ecoforecastR::ciEnvelope(time,ci[1,],ci[3,],col=ecoforecastR::col.alpha("lightBlue",0.75))
## Color the held-out (forecast) observations red, assimilated ones blue.
points.df <- as.data.frame(y.fcast)
points.df$color <- ifelse(is.na(points.df$y.fcast), "red", "blue")
points(time, y, pch="+", cex=.7, col=points.df$color)
The model was accurate enough - almost every data point fell within the confidence interval - but the confidence interval was very imprecise. It quickly expanded to include almost every possible data value, making it pretty useless as a forecast.
This could be improved by adding covariates - the flu trends closely reflect the time of year, so temperature or day of year might help constrain the precision. The accuracy is already high, and would likely remain high if the model included more information.
Here we’re going to use the Daymet product to get daily weather estimates, and then use daily minimum temperature (Tmin) as the covariate in our influenza model
# Create the data object, and set initial conditions using random samples from this data:
## NOTE(review): despite the comment above, no explicit inits are supplied to
## fit_dlm here — JAGS generates its own. Fit uses the FULL series y.
data <- list(y=log(y),
n=length(y),
x_ic=log(1000),
tau_ic=100,
a_obs=1,
r_obs=1,
a_add=1,
r_add=1)
## grab weather data
## Daily Daymet weather for Boston, 2003-2016, matching the flu record.
df <- daymetr::download_daymet(site = "Boston",
lat = 42.36,
lon = -71.06,
start = 2003,
end = 2016,
internal = TRUE)$data
## Downloading DAYMET data for: Boston at 42.36/-71.06 latitude/longitude !
## Done !
## Build calendar dates from year + day-of-year, then align daily Tmin to the
## (weekly) flu time axis by exact date match; unmatched dates become NA.
df$date <- as.Date(paste(df$year,df$yday,sep = "-"),"%Y-%j")
data$Tmin = df$tmin..deg.c.[match(time,df$date)]
## fit the model
## Dynamic linear model: random walk plus a fixed effect of Tmin.
ef.out <- ecoforecastR::fit_dlm(model=list(obs="y",fixed="~ Tmin", n.iter=3000),data)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 1240
## Unobserved stochastic nodes: 627
## Total graph size: 3816
##
## Initializing model
names(ef.out)
## [1] "params" "predict" "model" "data"
The package returns a list with four elements. params and predict are both the same mcmc.list objects we get back from JAGS, only split between the parameters and the latent state variables, respectively, to make it easier to perform diagnostics and visualizations:
## parameter diagnostics
## Drop the first 500 iterations as burn-in before summarizing.
params <- window(ef.out$params,start=500) ## remove burn-in
plot(params)
summary(params)
##
## Iterations = 500:3000
## Thinning interval = 1
## Number of chains = 3
## Sample size per chain = 2501
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## betaIntercept 0.311861 0.081257 0.0009381 1.308e-02
## betaTmin -0.001928 0.001126 0.0000130 9.516e-05
## beta_IC 0.953916 0.011795 0.0001362 1.811e-03
## tau_add 26.213880 1.658527 0.0191472 2.293e-02
## tau_obs 232.441310 35.053138 0.4046778 1.425e+00
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## betaIntercept 0.151850 0.253214 0.305643 0.370171 4.755e-01
## betaTmin -0.004127 -0.002686 -0.001919 -0.001166 2.578e-04
## beta_IC 0.930252 0.945420 0.954759 0.962418 9.773e-01
## tau_add 23.089753 25.065177 26.193737 27.324786 2.957e+01
## tau_obs 170.144277 207.854186 230.396134 254.737172 3.072e+02
cor(as.matrix(params))
## betaIntercept betaTmin beta_IC tau_add
## betaIntercept 1.00000000 -0.67123729 -0.99379509 -0.04441499
## betaTmin -0.67123729 1.00000000 0.63362521 0.02697605
## beta_IC -0.99379509 0.63362521 1.00000000 0.04601513
## tau_add -0.04441499 0.02697605 0.04601513 1.00000000
## tau_obs -0.04049258 0.02502820 0.04153022 -0.04420498
## tau_obs
## betaIntercept -0.04049258
## betaTmin 0.02502820
## beta_IC 0.04153022
## tau_add -0.04420498
## tau_obs 1.00000000
pairs(as.matrix(params))
## confidence interval
time.rng = c(1,length(time)) ## adjust to zoom in and out
## NOTE: from here on, `out` holds the LATENT STATES (ef.out$predict),
## not the parameter samples.
out <- as.matrix(ef.out$predict)
ci <- apply(exp(out),2,quantile,c(0.025,0.5,0.975))
plot(time,ci[2,],type='n',ylim=range(y,na.rm=TRUE),ylab="Flu Index",log='y',xlim=time[time.rng])
## adjust x-axis label to be monthly if zoomed
if(diff(time.rng) < 100){
axis.Date(1, at=seq(time[time.rng[1]],time[time.rng[2]],by='month'), format = "%Y-%m")
}
ecoforecastR::ciEnvelope(time,ci[1,],ci[3,],col=ecoforecastR::col.alpha("lightBlue",0.75))
points(time,y,pch="+",cex=0.5)
Let’s check the histograms for tau_obs and tau_add:
## BUG FIX: at this point `out` holds the latent states (ef.out$predict),
## not the precisions — so 1/sqrt(out[,1]) was transforming a state column
## and labeling it with a state name. Pull tau_add/tau_obs from the
## parameter chains by NAME so column ordering cannot bite us.
par.mat <- as.matrix(params)
hist(1/sqrt(par.mat[, "tau_add"]), main = "tau_add", breaks = 150)
hist(1/sqrt(par.mat[, "tau_obs"]), main = "tau_obs", breaks = 150)
The confidence intervals around the model became much more narrow when compared to initial random-walk model, indicating that error has decreased. But the error estimates (from the histograms above) actually seem to have increased with the addition of the temperature parameter: values are now above .4. But the distribution is tightly clustered around the mean, indicating greater confidence in the true values of uncertainty/process error.
Comparing the estimated value of the process model at the next time-step (without process error) with the actual posterior estimate from the time-step can make it clear where the model isn’t doing well enough. This is because the process error includes the variability not explained by the process model, so a large process error can indicate an uninformative process model.
Beta_Tmin and Beta_IC are positively correlated with each other. This makes sense, because the initial conditions (Beta_IC) come from the flu data, and the flu trends are closely correlated with minimum temperature (Beta_Tmin). The correlation between temperature and flu trends may be due to the flu virus remaining in the air longer when the air is dry and cold (Sundell et al. 2016).
Beta_intercept is almost perfectly negatively correlated with Beta_IC. This is because they are essentially the same value - the intercept is when our time-point is zero, and initial conditions are the starting conditions for our model, which are also when t = 0. The observation and process errors are randomly distributed with respect to the betas, which is good - we don’t want the errors to be biased.
Repeat the process of forecasting the last 40 observations (convert to NA), this time using the DLM with temperature as a covariate
# Create data object, and set initial conditions using random samples from this data:
## Refit the DLM on the forecast series (last 40 values NA) so the model must
## predict the held-out tail from Tmin plus its own dynamics.
data <- list(y=log(y.fcast),
n=length(y.fcast),
x_ic=log(1000),
tau_ic=100,
a_obs=1,
r_obs=1,
a_add=1,
r_add=1)
## grab weather data
## NOTE(review): this re-downloads the identical Daymet record fetched above;
## the earlier `df` could simply be reused.
df <- daymetr::download_daymet(site = "Boston",
lat = 42.36,
lon = -71.06,
start = 2003,
end = 2016,
internal = TRUE)$data
## Downloading DAYMET data for: Boston at 42.36/-71.06 latitude/longitude !
## Done !
df$date <- as.Date(paste(df$year,df$yday,sep = "-"),"%Y-%j")
data$Tmin = df$tmin..deg.c.[match(time,df$date)]
## fit the model
## Longer run with explicit adaptation since the forecast fit mixes slower.
ef.out.fcast <- ecoforecastR::fit_dlm(model=list(obs="y",fixed="~ Tmin", n.iter=20000, n.adapt=5000),data)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 1200
## Unobserved stochastic nodes: 667
## Total graph size: 3816
##
## Initializing model
names(ef.out.fcast)
## [1] "params" "predict" "model" "data"
## parameter diagnostics
## Burn-in removal and the same summary/correlation checks as the full fit.
params <- window(ef.out.fcast$params,start=500) ## remove burn-in
plot(params)
summary(params)
##
## Iterations = 500:20000
## Thinning interval = 1
## Number of chains = 3
## Sample size per chain = 19501
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## betaIntercept 0.296877 0.098132 4.057e-04 6.837e-03
## betaTmin -0.001674 0.001235 5.107e-06 5.697e-05
## beta_IC 0.956142 0.014282 5.905e-05 9.831e-04
## tau_add 26.607838 1.733035 7.165e-03 8.824e-03
## tau_obs 233.103533 34.591559 1.430e-01 4.865e-01
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## betaIntercept 0.109366 0.230673 0.294127 3.625e-01 4.939e-01
## betaTmin -0.004099 -0.002507 -0.001673 -8.338e-04 7.332e-04
## beta_IC 0.927332 0.946582 0.956536 9.658e-01 9.835e-01
## tau_add 23.322576 25.419415 26.568600 2.775e+01 3.010e+01
## tau_obs 172.515743 208.633777 230.415847 2.549e+02 3.079e+02
cor(as.matrix(params))
## betaIntercept betaTmin beta_IC tau_add
## betaIntercept 1.00000000 -0.71878252 -0.99545766 -0.05198638
## betaTmin -0.71878252 1.00000000 0.68740190 0.03491282
## beta_IC -0.99545766 0.68740190 1.00000000 0.05177143
## tau_add -0.05198638 0.03491282 0.05177143 1.00000000
## tau_obs 0.01892869 -0.01273321 -0.01897139 -0.03620493
## tau_obs
## betaIntercept 0.01892869
## betaTmin -0.01273321
## beta_IC -0.01897139
## tau_add -0.03620493
## tau_obs 1.00000000
pairs(as.matrix(params))
## confidence interval
## Zoom on the last ~80 points to inspect the DLM forecast region.
time.rng = c(length(time)-80,length(time)) ## adjust to zoom in and out
out <- as.matrix(ef.out.fcast$predict)
ci <- apply(exp(out),2,quantile,c(0.025,0.5,0.975))
plot(time,ci[2,],type='n',ylim=range(y,na.rm=TRUE),ylab="Flu Index",log='y',xlim=time[time.rng])
## adjust x-axis label to be monthly if zoomed
if(diff(time.rng) < 100){
axis.Date(1, at=seq(time[time.rng[1]],time[time.rng[2]],by='month'), format = "%Y-%m")
}
ecoforecastR::ciEnvelope(time,ci[1,],ci[3,],col=ecoforecastR::col.alpha("lightBlue",0.75))
## Held-out (forecast) observations in red, assimilated ones in blue.
points.df <- as.data.frame(y.fcast)
points.df$color <- ifelse(is.na(points.df$y.fcast), "red", "blue")
points(time, y, pch="+", cex=.7, col=points.df$color)
## Overlay comparison over the full record: random-walk envelope (green,
## from jags.out = the full-data random-walk run) vs DLM envelope (blue).
time.rng = c(1,length(time)) ## adjust to zoom in and out
out <- as.matrix(jags.out)
x.cols <- grep("^x",colnames(out)) ## grab all columns that start with the letter x
ci <- apply(exp(out[,x.cols]),2,quantile,c(0.025,0.5,0.975)) ## model was fit on log scale
plot(time,ci[2,],type='n',ylim=range(y,na.rm=TRUE),ylab="Flu Index",log='y',xlim=time[time.rng])
#axis.Date(1, at=seq(time[time.rng[1]],time[time.rng[2]],by='month'), format = "%Y-%m")
ecoforecastR::ciEnvelope(time,ci[1,],ci[3,],col=ecoforecastR::col.alpha("lightGreen",0.75))
## confidence interval
out <- as.matrix(ef.out$predict)
ci <- apply(exp(out),2,quantile,c(0.025,0.5,0.975))
## overlay the full-data DLM envelope on top of the random-walk envelope
ecoforecastR::ciEnvelope(time,ci[1,],ci[3,],col=ecoforecastR::col.alpha("blue",0.65))
points(time, y, pch="+", cex=.7)
The DLM model improved the random walk - it increased the precision, evident by the fact that the green (random walk) is almost always inclusive of the blue (DLM). The accuracy increased as well: the high outliers were captured by the DLM but not the random walk.
The model could potentially be further improved by the inclusion of more data (either higher resolution, or from neighboring states with similar climates). If the relationship between temperature and flu counts is well-understood, it could be included in a mechanistic way, through an epidemiological model in which susceptibility or spread is influenced by temperature.
Works cited:
Sundell N et al. J Clin Virol. 2016 Nov;84:59-63. doi: 10.1016/j.jcv.2016.10.005. Epub 2016 Oct 7.